{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "506fab2a-a50c-42bd-a106-c83a9d2828ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "--2024-06-13 13:53:24--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 3832 (3.7K) [text/plain]\n",
      "Saving to: 'minsearch.py'\n",
      "\n",
      "     0K ...                                                   100%  579K=0.006s\n",
      "\n",
      "2024-06-13 13:53:24 (579 KB/s) - 'minsearch.py' saved [3832/3832]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!rm -f minsearch.py\n",
    "!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3ac947de-effd-4b61-8792-a6d7a133f347",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<minsearch.Index at 0x1d9c9bd8890>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import requests \n",
    "import minsearch\n",
    "\n",
    "docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'\n",
    "docs_response = requests.get(docs_url)\n",
    "documents_raw = docs_response.json()\n",
    "\n",
    "documents = []\n",
    "\n",
    "for course in documents_raw:\n",
    "    course_name = course['course']\n",
    "\n",
    "    for doc in course['documents']:\n",
    "        doc['course'] = course_name\n",
    "        documents.append(doc)\n",
    "\n",
    "index = minsearch.Index(\n",
    "    text_fields=[\"question\", \"text\", \"section\"],\n",
    "    keyword_fields=[\"course\"]\n",
    ")\n",
    "\n",
    "index.fit(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8f087272-b44d-4738-9ea2-175ec63a058b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def search(query):\n",
    "    boost = {'question': 3.0, 'section': 0.5}\n",
    "\n",
    "    results = index.search(\n",
    "        query=query,\n",
    "        filter_dict={'course': 'data-engineering-zoomcamp'},\n",
    "        boost_dict=boost,\n",
    "        num_results=5\n",
    "    )\n",
    "\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "742ab881-499a-4675-83c4-2013ea1377b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_prompt(query, search_results):\n",
    "    prompt_template = \"\"\"\n",
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
    "Use only the facts from the CONTEXT when answering the QUESTION.\n",
    "\n",
    "QUESTION: {question}\n",
    "\n",
    "CONTEXT: \n",
    "{context}\n",
    "\"\"\".strip()\n",
    "\n",
    "    context = \"\"\n",
    "    \n",
    "    for doc in search_results:\n",
    "        context = context + f\"section: {doc['section']}\\nquestion: {doc['question']}\\nanswer: {doc['text']}\\n\\n\"\n",
    "    \n",
    "    prompt = prompt_template.format(question=query, context=context).strip()\n",
    "    return prompt\n",
    "\n",
    "def llm(prompt):\n",
    "    response = client.chat.completions.create(\n",
    "        model='gpt-4o',\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}]\n",
    "    )\n",
    "    \n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fe8bff3e-b672-42be-866b-f2d9bb217106",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rag(query):\n",
    "    search_results = search(query)\n",
    "    prompt = build_prompt(query, search_results)\n",
    "    answer = llm(prompt)\n",
    "    return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "988ece59-951a-4b32-ba3f-cb8efb66a9bb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
